library(tidyverse)
library(anytime)
library(gganimate)
library(gifski)
library(ggthemes)
library(sf)
library(transformr)
library(ggrepel)
# This project is based on the R language and the Uber-Lyft dataset on Kaggle (https://www.kaggle.com/datasets/ravi72munde/uber-lyft-cab-prices).
# Input files: change these paths to wherever the data lives on your machine.
# Ride records are read as a data frame; rows without a price are dropped.
df_origin <- read.csv(file = "/Users/guangjitang/Downloads/uber data/cab_rides.csv")
df <- filter(df_origin, !is.na(price))
# Weather records, also as a data frame.
weather <- read.csv(file = "/Users/guangjitang/Downloads/uber data/weather.csv")
# Restores the objects saved in map.rda; the code below expects `map` to be
# among them.
load(file = "/Users/guangjitang/test1/map.rda")
# Middle point of every map area: the mean of the vertex coordinates found in
# the first component (`[[1]]`) of each geometry.
# NOTE(review): presumably that component is the outer ring of a POLYGON --
# confirm against how map.rda was built.
# vapply() replaces the former `1:length()` index loop, which would have
# iterated over c(1, 0) for an empty map.
ring_means <- vapply(
  map$geometry,
  function(poly) colMeans(poly[[1]][, 1:2, drop = FALSE]),
  numeric(2)
)
map$long_mid <- ring_means[1, ]
map$lat_mid <- ring_means[2, ]
# Give each map area the ride-location name it corresponds to, so that `map`
# can later be joined with the ride data on that name.
u <- unique(df$source)
map$nname <- NA
count_name <- 0
# Each location name is used as a case-insensitive pattern against all area
# names at once. When several names match the same area, the last one in `u`
# wins; count_name totals every match found.
for (area in u) {
  hits <- grepl(area, map$Name, ignore.case = TRUE)
  map$nname[hits] <- area
  count_name <- count_name + sum(hits)
}
count_name
## [1] 11
# Only 11 names were matched automatically; add the last one manually.
# NOTE(review): this relies on row 6 of `map` being the Haymarket Square
# area -- confirm whenever map.rda changes.
map$nname[6] <- "Haymarket Square"
# map2 holds only the areas that received a ride-location name, reduced to the
# columns used later for joining, drawing and labelling.
map2 <- select(
  filter(map, !is.na(nname)),
  nname, geometry, long_mid, lat_mid
)
# Switch for quick experiments: enable the sample_n() call to work on a subset.
# df_plot is built from df_sample so that the switch actually takes effect.
df_sample <- df # %>% sample_n(10000) # sample for test only
df_plot <- df_sample %>%
  mutate(
    # Weather is looked up for the source of the ride only.
    location = source,
    # time_stamp is divided by 1000 before conversion (presumably it is in
    # milliseconds -- confirm against the dataset description).
    time = anytime(time_stamp / 1000),
    # Calendar date ("YYYY-MM-DD"); despite its name this column is the date,
    # and it is the key used to join with the weather table.
    # format() is used explicitly rather than substr() on the date-time:
    # substr() relies on the implicit as.character() conversion, which in
    # recent R versions can omit the time part for exact-midnight values and
    # would then yield an NA hour.
    time_hour = format(time, "%Y-%m-%d"),
    # Hour of day, 0-23.
    hour = as.numeric(format(time, "%H"))
  )
# Classify each ride by part of day and record how many hours that part spans:
#   hours 22-23 and 0-5 -> night (8 h)     hours 6-10  -> morning (5 h)
#   hours 11-13 -> noon (3 h)              hours 14-18 -> afternoon (5 h)
#   hours 19-21 -> evening (3 h)
# Conditions are tested top-down, so the first matching threshold decides.
# Rows with a missing hour fall through to the "night" default.
df_plot <- df_plot %>%
  mutate(
    part_of_time = case_when(
      hour > 21 ~ "night",
      hour > 18 ~ "evening",
      hour > 13 ~ "afternoon",
      hour > 10 ~ "noon",
      hour > 5 ~ "morning",
      TRUE ~ "night"
    ),
    part_duration = case_when(
      hour > 21 ~ 8,
      hour > 18 ~ 3,
      hour > 13 ~ 5,
      hour > 10 ~ 3,
      hour > 5 ~ 5,
      TRUE ~ 8
    )
  )
# Running row number used as a ride identifier in later steps.
# seq_len() stays empty for an empty table, where seq(1, 0) would count down.
df_plot$nid <- seq_len(nrow(df_plot))
# Prepare the weather table with the same two helper columns as the rides:
# a date-time built from time_stamp and the first 10 characters of its text
# form, used as join key.
weather <- mutate(
  weather,
  time = anytime(time_stamp),
  time_hour = substr(time, 1, 10)
)
# df_plot <- merge(df_plot, weather, by = c("time_hour", "location"))
# Pair every ride with every weather row sharing its time_hour and location.
df_plot <- inner_join(x = df_plot, y = weather, by = c("time_hour", "location"))
# Keep, for every ride (nid), only the weather record closest in time.
# g_time is the absolute gap in seconds between the ride time (time.x) and the
# weather time (time.y). Sorting by nid and then by gap, and keeping the first
# row per nid, selects the smallest gap; top_n(1, g_time) would select the
# largest one, and would also keep ties.
# The result is ungrouped, so later steps do not inherit one group per ride.
df_plot <- df_plot %>%
  mutate(
    g_time = abs(as.numeric(difftime(time.x, time.y, units = "secs")))
  ) %>%
  arrange(nid, g_time) %>%
  distinct(nid, .keep_all = TRUE) %>%
  select(-g_time)
# A ride counts as rainy when its matched weather row has a rain value.
df_plot <- mutate(df_plot, rainy = !is.na(rain))
# Write the prepared table to disk and read it straight back.
save(df_plot, file = "df_plot.rda")
load(file = "df_plot.rda")
# df_price is prepared for analyses of the average price by distance
# (avg_price = price / distance). Only rides with a distance above 0.5 mile
# are kept, since avg_price becomes extremely large and meaningless for
# shorter rides.
df_price <- mutate(
  filter(df_plot, distance > 0.5),
  avg_price = price / distance
)
# Price against distance, one panel per cab type, coloured by the rainy flag,
# with a smoothed trend per colour group.
ggplot(data = df_plot, mapping = aes(x = distance, y = price, color = rainy)) +
  geom_point(size = 1, alpha = 0.3) +
  geom_smooth() +
  facet_wrap("cab_type") +
  scale_color_manual(values = c("#ffcc61", "blue")) +
  theme_economist()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Base map: every area drawn in white with grey outlines, and the named areas
# of map2 drawn on top, filled by area name.
plot2 <- ggplot() +
  geom_sf(
    data = map,
    mapping = aes(fill = "Rest"),
    fill = "white",
    colour = "grey",
    show.legend = FALSE
  ) +
  geom_sf(
    data = map2,
    mapping = aes(fill = nname),
    show.legend = TRUE,
    na.rm = TRUE
  ) +
  # Restrict the view to a fixed longitude/latitude window.
  coord_sf(
    xlim = c(-71.14, -71.03),
    ylim = c(42.31, 42.39),
    lims_method = "cross"
  ) +
  theme_map() +
  theme(
    legend.position = c(0.95, 0),
    legend.key.size = unit(0.5, "cm"),
    legend.text = element_text(size = 10)
  ) +
  labs(title = "Areas", fill = "Area name")
plot2
# Rides leaving (source_n) and arriving at (destination_n) each area, and
# their difference (out - in).
# The two counts are matched by area name instead of by row position, so the
# result stays correct even if the source and destination tables list
# different areas or list them in a different order. An area missing on one
# side counts as 0 there.
df_plot3 <- df_plot %>%
  ungroup() %>%
  count(source, name = "source_n") %>%
  rename(nname = source)
df_plot3_temp <- df_plot %>%
  ungroup() %>%
  count(destination, name = "destination_n") %>%
  rename(nname = destination)
df_plot3 <- full_join(df_plot3, df_plot3_temp, by = "nname") %>%
  mutate(
    source_n = coalesce(source_n, 0L),
    destination_n = coalesce(destination_n, 0L),
    difference = source_n - destination_n
  )
# Attach the counts to the named map areas.
map_plot3 <- merge(map2, df_plot3, by = "nname")
# Area map with one point per area at its middle point, coloured by out - in
# (cyan for the lowest value, red for the highest).
plot3 <- plot2 +
  geom_point(
    data = map_plot3,
    mapping = aes(x = long_mid, y = lat_mid, color = difference)
  ) +
  scale_color_gradient(low = "cyan", high = "red")
# Choropleth of out - in per area: all areas in white with grey outlines
# underneath, named areas filled by `difference` on a three-colour gradient
# (cyan anchored at the top of the scale, white at the middle, red at the
# bottom).
plot4 <- ggplot() +
  geom_sf(
    data = map,
    mapping = aes(fill = "Rest"),
    fill = "white",
    colour = "grey",
    show.legend = FALSE
  ) +
  geom_sf(data = map_plot3, mapping = aes(fill = difference)) +
  scale_fill_gradientn(
    colours = c("cyan", "white", "red"),
    values = c(1, 0.5, 0)
  ) +
  coord_sf(
    xlim = c(-71.14, -71.03),
    ylim = c(42.31, 42.39),
    lims_method = "cross"
  ) +
  theme_map() +
  theme(
    legend.position = c(0.95, 0),
    legend.key.size = unit(0.5, "cm"),
    legend.text = element_text(size = 10)
  )
plot4
# Out - in difference by part of day.
# Every ride contributes 1 / part_duration, so the sums are rides per hour
# within each part of day and parts of different length stay comparable.
df_plot3_2 <- df_plot %>%
  group_by(source, part_of_time) %>%
  summarise(n = sum(1 / part_duration), .groups = "drop") %>%
  rename(source_n = n, nname = source)
df_plot3_2_temp <- df_plot %>%
  group_by(destination, part_of_time) %>%
  summarise(n = sum(1 / part_duration), .groups = "drop")
# Match departures and arrivals on area name AND part of day rather than on
# row position; a combination missing on one side counts as 0 there.
df_plot3_2 <- df_plot3_2 %>%
  full_join(
    rename(df_plot3_2_temp, nname = destination, destination_n = n),
    by = c("nname", "part_of_time")
  ) %>%
  mutate(
    source_n = coalesce(source_n, 0),
    destination_n = coalesce(destination_n, 0),
    difference = source_n - destination_n
  )
map_plot3_2 <- merge(map2, df_plot3_2, by = "nname")
# Area map repeated per part of day, with one point per area coloured by
# out - in.
# The colour scale runs from red (lowest) to cyan (highest) so that it agrees
# with the caption, and the legend title is set on the colour scale (the one
# showing `difference`) instead of replacing the area-name fill legend.
plot2 +
  facet_grid(~part_of_time) +
  geom_point(
    data = map_plot3_2,
    mapping = aes(x = long_mid, y = lat_mid, color = difference)
  ) +
  scale_color_gradient(low = "red", high = "cyan") +
  labs(
    title = "Cab out-in in Boston area by part of time in a day",
    caption = "Cyan for out>in",
    color = "Out - in"
  )
# Out - in difference by hour of day and area.
# Every bucket here covers exactly one hour, so rides are simply counted;
# weighting by 1 / part_duration (meant for the part-of-day buckets) would
# scale hours differently depending on which part of day they belong to.
df_Hour_x_Source <- df_plot %>%
  group_by(source, hour) %>%
  summarise(
    n = n(),
    avg_price = mean(price / distance),
    .groups = "drop"
  ) %>%
  rename(source_n = n, nname = source)
df_Hour_x_Source_temp <- df_plot %>%
  group_by(destination, hour) %>%
  summarise(n = n(), .groups = "drop")
# Match departures and arrivals on area name AND hour rather than on row
# position; a combination missing on one side counts as 0 there.
df_Hour_x_Source <- df_Hour_x_Source %>%
  full_join(
    rename(df_Hour_x_Source_temp, nname = destination, destination_n = n),
    by = c("nname", "hour")
  ) %>%
  mutate(
    source_n = coalesce(source_n, 0L),
    destination_n = coalesce(destination_n, 0L),
    difference = source_n - destination_n
  )
map_Hour_x_Source <- merge(map2, df_Hour_x_Source, by = "nname")
# Print one map per hour of day showing the smoothed out - in difference of
# every area; intended to be passed to save_gif(), which records each printed
# plot as a frame.
#
# hourly_map: areas with `hour`, `nname`, `difference`, `long_mid`, `lat_mid`
#             columns and a geometry; defaults to map_Hour_x_Source.
# base_map:   all areas, drawn in white underneath; defaults to map.
# Returns (invisibly) the list produced by printing the per-hour plots.
makeplot <- function(hourly_map = map_Hour_x_Source, base_map = map) {
  raw <- split(hourly_map, hourly_map$hour)
  hours <- as.integer(names(raw))

  # `difference` recorded at hour `key` for the areas of `target`, matched by
  # area name; 0 where that hour or area has no record.
  neighbour_difference <- function(key, target) {
    neighbour <- raw[[key]]
    if (is.null(neighbour)) {
      return(rep(0, nrow(target)))
    }
    d <- neighbour$difference[match(target$nname, neighbour$nname)]
    d[is.na(d)] <- 0
    d
  }

  # Add overlap: blend each hour with 0.75 x the previous and the next hour,
  # wrapping around midnight. Neighbours are always read from the untouched
  # `raw` copy; updating the list in place would feed already-smoothed values
  # into the following hours.
  smoothed <- raw
  for (k in seq_along(raw)) {
    prev_key <- as.character((hours[k] - 1L) %% 24L)
    next_key <- as.character((hours[k] + 1L) %% 24L)
    smoothed[[k]]$difference <-
      0.75 * neighbour_difference(prev_key, raw[[k]]) +
      raw[[k]]$difference +
      0.75 * neighbour_difference(next_key, raw[[k]])
  }

  frames <- lapply(smoothed, function(data) {
    p <- ggplot() +
      geom_sf(
        data = base_map, mapping = aes(fill = "Rest"),
        colour = "grey", fill = "white", show.legend = FALSE
      ) +
      geom_sf(data = data, mapping = aes(fill = difference)) +
      geom_text_repel(
        data = data,
        mapping = aes(long_mid, lat_mid, label = nname)
      ) +
      scale_fill_gradientn(
        values = c(1, 0.5, 0),
        colours = c("cyan", "white", "red")
      ) +
      coord_sf(
        xlim = c(-71.14, -71.03), ylim = c(42.31, 42.39),
        lims_method = "cross"
      ) +
      theme_map() +
      theme(
        legend.position = c(0.95, 0),
        legend.key.size = unit(0.5, "cm"),
        legend.text = element_text(size = 10)
      ) +
      labs(
        title = "Cab out-in in Boston area",
        caption = "In and out", fill = "Out - in"
      ) +
      labs(subtitle = paste("Hour: ", data$hour[1]))
    print(p)
  })
  invisible(frames)
}
# Record every plot printed by makeplot() as a frame of a 1280 x 720 GIF,
# with a delay of 0.5 between frames, then embed the file in the report.
gif_file <- "gif_1.gif"
save_gif(
  makeplot(),
  gif_file = gif_file,
  width = 1280,
  height = 720,
  delay = 0.5
)
## [1] "/Users/guangjitang/test1/gif_1.gif"
knitr::include_graphics(path = gif_file)
# Price and distance by hour: animated scatter with one smoothed trend per
# cab type.
# Points are drawn only for rides whose nid is a multiple of 10 to keep the
# frames light; the trend lines use the full table.
df_plot_sample <- df_plot %>% filter(nid %% 10 == 0)
ggplot(df_plot, aes(distance, price, color = cab_type)) +
  transition_time(hour) +
  geom_point(data = df_plot_sample, size = 1, alpha = 0.5) +
  geom_smooth() +
  # Zoom the y axis to 0-50 without dropping data from the smoother.
  coord_cartesian(ylim = c(0, 50)) +
  # Labels describe this chart; the legend title goes on the colour scale,
  # which is the only mapped aesthetic besides x and y.
  labs(
    title = "Cab price by distance in Boston area",
    caption = "Points: rides with nid divisible by 10; curves: all rides",
    color = "Cab type"
  ) +
  labs(subtitle = "Hour: {frame_time}")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'